* Converting the PGSs files to stata format
* Dilnoza Muslimova 
* July 2020
* Last updated May 2021

* Set the working directory to the project root; every file path below is relative to it.
* "projectfolder" is a placeholder -- replace it with your own path (e.g. one containing your username).
cd "projectfolder"

*** Rule for naming the scores: phenotype_sample_method
*** All scores with prior 1

{
********************************************************************************
*** Sumstats 44-46: EA NEW UKB, LDPRED -- full sample, sample 0, sample 1
********************************************************************************
* The three .profile score files are converted in a single loop. For each file:
* import it, keep the individual ID and the score sum, store the negated score
* under the phenotype_sample_method name, rename the ID to id_ukb, and save a
* .dta next to the .profile under the same file stem.
foreach part in full 0 1 {

	* The full-sample file and variable carry no tag; the split-sample files are
	* tagged _sample0 / _sample1 and their score variables _0 / _1.
	if "`part'" == "full" {
		local filetag ""
		local vartag ""
	}
	else {
		local filetag "_sample`part'"
		local vartag "_`part'"
	}
	local stem "Analysis\Input\PGS_ldpred_UKB_EA_new_nosibsrel`filetag'_p1"

	clear all
	import delimited "`stem'.profile", delimiter(space, collapse) varnames(1)
	* Author's note: 446,339 passed QC, european, and gave consent for the data
	summarize
	* To check by eye that the dataset starts with the correct UKB id, run: br

	* Keep the ID and the score only; the stored score is scoresum with its sign flipped
	keep iid scoresum
	rename iid id_ukb
	generate ea_new_ukb_ld`vartag' = -scoresum
	drop scoresum

	save "`stem'.dta", replace
}
}
{
********************************************************************************
*** Merge all PGS into one set & add Norface ID 
********************************************************************************
clear all

* Start from the ID key file, then attach each converted score file on id_ukb.
* The merge indicator is not kept, so unmatched rows from either side stay in
* the data without a flag.
use "Analysis\Input\ID_Norface_ID_UKB_key.dta"

local pgsstem "Analysis\Input\PGS_ldpred_UKB_EA_new_nosibsrel"
foreach tail in _p1 _sample0_p1 _sample1_p1 {
	merge 1:1 id_ukb using "`pgsstem'`tail'.dta", nogenerate
}

}

{
*** Standardize scores within the respective holdout samples 
* Each score is centred and scaled with the mean and SD taken over all of its
* non-missing observations currently in memory (the merged key + PGS data).
* NOTE(review): this runs before the phenotype/consent filters further down, so
* the scores need not have mean 0 / SD 1 in the final saved sample -- confirm
* this is intended.
global pgs ea_new_ukb_ld          ea_new_ukb_ld_0        ea_new_ukb_ld_1 

foreach i of global pgs {
	sum `i', detail
	scalar m = r(mean)
	scalar sd = r(sd)
	* Refer to the scalars through scalar(): variables and scalars share one
	* namespace and variables win, also via unique abbreviation, so a bare m or
	* sd would silently pick up a variable whose name starts with those letters.
	replace `i' = (`i' - scalar(m)) / scalar(sd)
}

** Labelling the pgs, keep an eye on the holdout sample 
* The two split-sample scores get distinct labels so they can be told apart.
label var ea_new_ukb_ld "standardized EA new pgs, UKB nosibrels, ldpred"
label var ea_new_ukb_ld_0 "standardized splitsample 0 EA new pgs, UKB nosibpcrels, ldpred"
label var ea_new_ukb_ld_1 "standardized splitsample 1 EA new pgs, UKB nosibpcrels, ldpred"

}

{
********************************************************************************
*** Adding Phenotypes, PCs, and Relatedness 
********************************************************************************
* Flag IDs that occur more than once (dup > 0), then drop rows whose ID is
* negative or system-missing before the 1:1 merges on ID below.
* NOTE(review): rows that came only from the PGS files presumably have a missing
* ID after the id_ukb merges and would be flagged as duplicates here -- confirm.
duplicates tag ID, generate(dup)
* NOTE(review): br opens the Data Browser, which is an interactive step; the
* per-file checks earlier in this do-file keep it commented out ("* br").
* Consider doing the same here if the file is run unattended.
br if dup>0
drop if ID<0
drop if ID==.

* Attach the extracted phenotype data; _mergeExtDt records the match status.
merge 1:1 ID using "Analysis\Input\ExtractedData.dta", gen(_mergeExtDt)

/*

    Result                           # of obs.
    -----------------------------------------
    not matched                        15,233
        from master                        23  (_mergeExtDt==1)
        from using                     15,210  (_mergeExtDt==2)

    matched                           487,297  (_mergeExtDt==3)
    -----------------------------------------

*/

* Keep only rows present in both the score data and the phenotype data.
keep if _mergeExtDt==3
*(15,233 observations deleted)

// Merge with PCs
merge 1:1 ID using "Analysis\Input\PGSs_PCs_Ancestry.dta", gen(_mergePC)
/*
        
    Result                           # of obs.
    -----------------------------------------
    not matched                        15,299
        from master                         0  (_mergePC==1)
        from using                     15,299  (_mergePC==2)

    matched                           487,297  (_mergePC==3)
    -----------------------------------------


*/
* Keep only rows that also have PCs.
keep if _mergePC==3
*(15,299 observations deleted)

// Merge sibling identifiers
* The score's predictive power should be tested only on siblings and possibly
* their relatives; applying it to everyone would overfit.
* NOTE(review): no keep/drop on _mergeRL follows, so master-only and using-only
* rows are still in the data after this merge.
* NOTE(review): this is the only input read from the working directory rather
* than from Analysis\Input\ -- confirm the file location.
merge 1:1 ID using "Relatedness_to_siblings_UKB.dta", gen(_mergeRL)
/*

    Result                           # of obs.
    -----------------------------------------
    not matched                       339,805
        from master                   339,792  (_mergeRL==1)
        from using                         13  (_mergeRL==2)

    matched                           147,505  (_mergeRL==3)
    -----------------------------------------

*/

* Remove participants listed in either withdrawn-consent file: any row found in
* the withdrawal file (matched = 3, or using-only = 2) is dropped.
merge 1:1 ID using "Analysis\Input\w41382_20210201_withdrew_consent.dta", gen(consent)
drop if consent>1
merge 1:1 ID using "Analysis\Input\w41382_20200820_withdrew_consent.dta", gen(consent2)
drop if consent2>1
* Attach original_EA.dta and keep only rows present on both sides.
merge 1:1 ID using "Analysis\Input\original_EA.dta", gen(EA2)
keep if EA2==3

/* Construct EA phenotype -- OLD METHOD, commented out and not executed; kept for reference.
   It uses only the first answer slot (_0) of each wave and codes a missing
   first-wave answer as 7 years.
rename c_quals_* n_6138_*   
gen EduYears = .
replace EduYears = 7 if n_6138_0_0 == -7
replace EduYears = 7 if n_6138_0_0 == .
replace EduYears = 20 if n_6138_0_0 == 1
replace EduYears = 13 if n_6138_0_0 == 2
replace EduYears = 10 if n_6138_0_0 == 3
replace EduYears = 10 if n_6138_0_0 == 4
replace EduYears = 19 if n_6138_0_0 == 5
replace EduYears = 15 if n_6138_0_0 == 6

//Take EduYears from 2nd round of measurement if not available in 1st round
replace EduYears = 7 if (n_6138_1_0 == -7 & EduYears ==.)
replace EduYears = 20 if (n_6138_1_0 == 1 & EduYears ==.)
replace EduYears = 13 if (n_6138_1_0 == 2 & EduYears ==.)
replace EduYears = 10 if (n_6138_1_0 == 3 & EduYears ==.)
replace EduYears = 10 if (n_6138_1_0 == 4 & EduYears ==.)
replace EduYears = 19 if (n_6138_1_0 == 5 & EduYears ==.)
replace EduYears = 15 if (n_6138_1_0 == 6 & EduYears ==.)

//Take EduYears from 3rd round of measurement if not available in 2nd round
replace EduYears = 7 if (n_6138_2_0 == -7 & EduYears ==.)
replace EduYears = 20 if (n_6138_2_0 == 1 & EduYears ==.)
replace EduYears = 13 if (n_6138_2_0 == 2 & EduYears ==.)
replace EduYears = 10 if (n_6138_2_0 == 3 & EduYears ==.)
replace EduYears = 10 if (n_6138_2_0 == 4 & EduYears ==.)
replace EduYears = 19 if (n_6138_2_0 == 5 & EduYears ==.)
replace EduYears = 15 if (n_6138_2_0 == 6 & EduYears ==.)

*/

* EduYears is not created in this do-file (the construction above is commented
* out), so it must already be in memory at this point -- presumably it arrives
* with one of the merged inputs (original_EA.dta?) -- TODO confirm.
sum EduYears
* From here on the old-method phenotype is called EA.
rename EduYears EA 
/* 
		  
    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
    EduYears |    482,073    13.93873    5.140376          7         20



*/

*** NEW METHOD: take the MAX over the answers within each wave, then fill missings
*** from later waves -- distinguishing -7 "None of the above" from -3 "Prefer not to answer"
* NOTE(review): the n_6138_* variables must already be in memory; the rename from
* c_quals_* in the old-method section is commented out -- confirm their source.
summarize n_6138_*

* One pass per wave (0, 1, 2). Within a wave each of the six answer slots is
* copied and recoded to years of education, and the wave value is the row
* maximum over the slots. Wave 0 uses the stub EA_new and its result is EA_new;
* waves 1 and 2 use the stubs EA1_new / EA2_new and results EA_1_new / EA_2_new.
forvalues wave = 0/2 {
	if `wave' == 0 {
		local stub "EA_new"
		local best "EA_new"
	}
	else {
		local stub "EA`wave'_new"
		local best "EA_`wave'_new"
	}
	forvalues num = 0/5 {
		generate `stub'_`num' = n_6138_`wave'_`num'
		recode `stub'_`num' (-7 = 7) (-3 = .) (1 = 20) (2 = 13) (3 4 = 10) (5 = 19) (6 = 15)
	}
	egen `best' = rowmax(`stub'*)
}

* Where wave 0 gave no value fall back to wave 1, and then to wave 2
foreach later in EA_1_new EA_2_new {
	replace EA_new = `later' if EA_new == .
}
summarize EA_new

/*    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
      EA_new |    481,636    14.91878    5.115882          7         20
*/

* Distribution of the relationship-to-sibling categories
tabulate relationship
	
/* 
   relationship to a sibling, 0 -|
          unrelated to a sibling |      Freq.     Percent        Cum.
---------------------------------+-----------------------------------
         Not related to siblings |     91,068       61.73       61.73
                    full sibling |     41,502       28.13       89.87
2nd or 3rd Relative of a sibling |     10,208        6.92       96.79
       Parent/child of a sibling |      4,740        3.21      100.00
---------------------------------+-----------------------------------
                           Total |    147,518      100.00


*/	

}

{
********************************************************************************
*** Saving the complete dataset 
********************************************************************************
* Only the ID, the two EA phenotypes, the three standardized scores and the
* listed covariates are written out; every other variable is dropped.
local phenos EA EA_new
local scores ea_new_ukb_ld ea_new_ukb_ld_0 ea_new_ukb_ld_1
local covars YoB MoB sex e_PC_1-e_PC_40 famid relationship

keep ID `phenos' `scores' `covars'

save "Analysis\Input\PGS_ldpred_plink_EA_height_cvd_bmi.dta", replace

}

*end of do-file
